library(dplyr)
library(MASS)
library(ggplot2)
library(GGally)
library(leaps)
library(tidyverse)
library(performance)
library(patchwork)
library(interactions)
library(modelr)
library(caret)
crime_df =
read_csv("cdi.csv") %>%
mutate(
region = factor(region),
pop_density = pop / area,
beds_density_1000 = beds / pop * 1000,
docs_density = docs / pop,
crm_1000 = 1000 * crimes / pop
) %>%
dplyr::select(-crimes, -id,-area, -pop, -beds, -docs) %>%
dplyr::select(crm_1000, everything()) %>%
drop_na()
First examine the marginal distributions of each variable.
#Load data
crime_df1 =
read_csv("cdi.csv") %>%
mutate(
region = factor(region),
pop_density = pop / area,
crm_1000 = 1000 * crimes / pop
) %>%
dplyr::select(-pop, -id, -cty, -state, -area, -crimes)
To begin with, we take a look on the distribution of crime rate per 1000 population. We find that Kings County in NY is an extreme outlier with close to 300 crime rate, so we may not include this data in the following analysis.
First examine the marginal distributions of each variable.
# graph
compute_plist = function(x) {
if (is.factor(x)) {
pl = crime_df1 %>%
ggplot(aes(x = x, y = crm_1000)) +
geom_boxplot() +
labs(
x = '',
y = "Crime Rate",
)
} else {
pl = crime_df1 %>%
ggplot(aes(x = x, y = crm_1000)) +
geom_point() +
geom_smooth(method = "lm") +
labs(
x = '',
y = "Crime Rate",
)
}
return(pl)
}
plist = crime_df1 %>%
dplyr::select(-crm_1000) %>%
map(compute_plist)
labels = crime_df1 %>%
dplyr::select(-crm_1000) %>%
names()
for (i in 1:length(plist)) {
plist[[i]] = plist[[i]] + labs(x = labels[[i]])
}
wrap_plots(plist, ncol = 4)
Compare p-value of all variables
compute_p_value = function(x) {
lm_data = lm(crm_1000 ~ x, data = crime_df1)
if (is.factor(x)) {
return( lm_data %>% broom::tidy() %>% dplyr::select(p.value) %>% .[2:4,1])
}
return(lm_data %>% broom::tidy() %>% dplyr::select(p.value) %>% .[2,1])
}
labels_list <- c("pop18","pop65","docs","beds","hsgrad","bagrad","poverty","unemp","pcincome","totalinc","region_1","region_2", "region_3", "pop_density")
p_values = crime_df1 %>%
dplyr::select(-crm_1000) %>%
map_df(compute_p_value) %>%
mutate(Variables = labels_list)
p_values
## # A tibble: 14 × 2
## p.value Variables
## <dbl> <chr>
## 1 5.75e- 5 pop18
## 2 1.64e- 1 pop65
## 3 4.31e-11 docs
## 4 1.44e-17 beds
## 5 1.60e- 6 hsgrad
## 6 4.23e- 1 bagrad
## 7 8.92e-26 poverty
## 8 3.81e- 1 unemp
## 9 9.27e- 2 pcincome
## 10 1.32e- 6 totalinc
## 11 4.06e- 3 region_1
## 12 6.09e-19 region_2
## 13 2.33e- 7 region_3
## 14 8.66e-27 pop_density
crime_df2 =
read_csv("cdi.csv") %>%
mutate(
region = factor(region),
pop_density = pop / area,
crm_1000 = 1000 * crimes / pop,
beds_density = beds / pop,
docs_density = docs / pop
) %>%
dplyr::select(-crimes, -id, -area, -pop, -cty, -state, -beds, -docs, -region) %>%
dplyr::select(crm_1000, everything())
## Rows: 440 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cty, state
## dbl (15): id, area, pop, pop18, pop65, docs, beds, crimes, hsgrad, bagrad, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# fit regression using all predictors
mult.fit = lm(crm_1000 ~ ., data = crime_df2)
step(mult.fit, direction = 'backward')
## Start: AIC=2659.33
## crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + unemp +
## pcincome + totalinc + pop_density + beds_density + docs_density
##
## Df Sum of Sq RSS AIC
## - bagrad 1 0.3 175634 2657.3
## - docs_density 1 5.4 175640 2657.3
## - pcincome 1 98.1 175732 2657.6
## - pop65 1 175.9 175810 2657.8
## - pop18 1 350.4 175985 2658.2
## - hsgrad 1 604.4 176239 2658.8
## <none> 175634 2659.3
## - unemp 1 1376.7 177011 2660.8
## - beds_density 1 1776.4 177411 2661.8
## - totalinc 1 3163.2 178797 2665.2
## - poverty 1 20581.5 196216 2706.1
## - pop_density 1 30494.9 206129 2727.8
##
## Step: AIC=2657.33
## crm_1000 ~ pop18 + pop65 + hsgrad + poverty + unemp + pcincome +
## totalinc + pop_density + beds_density + docs_density
##
## Df Sum of Sq RSS AIC
## - docs_density 1 5.1 175640 2655.3
## - pop65 1 175.6 175810 2655.8
## - pcincome 1 195.1 175830 2655.8
## - pop18 1 470.2 176105 2656.5
## <none> 175634 2657.3
## - hsgrad 1 868.0 176502 2657.5
## - unemp 1 1499.7 177134 2659.1
## - beds_density 1 1932.7 177567 2660.1
## - totalinc 1 3188.6 178823 2663.2
## - poverty 1 26123.4 201758 2716.3
## - pop_density 1 30577.1 206212 2725.9
##
## Step: AIC=2655.34
## crm_1000 ~ pop18 + pop65 + hsgrad + poverty + unemp + pcincome +
## totalinc + pop_density + beds_density
##
## Df Sum of Sq RSS AIC
## - pop65 1 175.5 175815 2653.8
## - pcincome 1 204.3 175844 2653.8
## - pop18 1 472.0 176112 2654.5
## <none> 175640 2655.3
## - hsgrad 1 864.2 176504 2655.5
## - unemp 1 1495.7 177135 2657.1
## - totalinc 1 3183.5 178823 2661.2
## - beds_density 1 3218.9 178859 2661.3
## - poverty 1 26501.7 202141 2715.2
## - pop_density 1 30736.7 206376 2724.3
##
## Step: AIC=2653.78
## crm_1000 ~ pop18 + hsgrad + poverty + unemp + pcincome + totalinc +
## pop_density + beds_density
##
## Df Sum of Sq RSS AIC
## - pcincome 1 227.1 176042 2652.3
## <none> 175815 2653.8
## - hsgrad 1 992.1 176807 2654.3
## - pop18 1 1252.2 177067 2654.9
## - unemp 1 1656.9 177472 2655.9
## - beds_density 1 3119.9 178935 2659.5
## - totalinc 1 3153.5 178969 2659.6
## - poverty 1 29316.3 205131 2719.6
## - pop_density 1 30562.1 206377 2722.3
##
## Step: AIC=2652.35
## crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc + pop_density +
## beds_density
##
## Df Sum of Sq RSS AIC
## <none> 176042 2652.3
## - pop18 1 1110 177153 2653.1
## - hsgrad 1 1331 177373 2653.7
## - unemp 1 1609 177651 2654.3
## - beds_density 1 3580 179622 2659.2
## - totalinc 1 4257 180299 2660.9
## - poverty 1 33494 209537 2727.0
## - pop_density 1 35476 211518 2731.1
##
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc +
## pop_density + beds_density, data = crime_df2)
##
## Coefficients:
## (Intercept) pop18 hsgrad poverty unemp
## -1.618e+01 4.235e-01 4.017e-01 2.912e+00 -1.079e+00
## totalinc pop_density beds_density
## 2.560e-04 4.477e-03 1.629e+03
step(mult.fit, direction = 'forward')
## Start: AIC=2659.33
## crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + unemp +
## pcincome + totalinc + pop_density + beds_density + docs_density
##
## Call:
## lm(formula = crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty +
## unemp + pcincome + totalinc + pop_density + beds_density +
## docs_density, data = crime_df2)
##
## Coefficients:
## (Intercept) pop18 pop65 hsgrad bagrad
## -1.192e+01 3.427e-01 -2.230e-01 3.368e-01 8.503e-03
## poverty unemp pcincome totalinc pop_density
## 2.970e+00 -1.048e+00 2.649e-04 2.363e-04 4.391e-03
## beds_density docs_density
## 1.765e+03 -1.299e+02
step(mult.fit, direction = 'both')
## Start: AIC=2659.33
## crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + unemp +
## pcincome + totalinc + pop_density + beds_density + docs_density
##
## Df Sum of Sq RSS AIC
## - bagrad 1 0.3 175634 2657.3
## - docs_density 1 5.4 175640 2657.3
## - pcincome 1 98.1 175732 2657.6
## - pop65 1 175.9 175810 2657.8
## - pop18 1 350.4 175985 2658.2
## - hsgrad 1 604.4 176239 2658.8
## <none> 175634 2659.3
## - unemp 1 1376.7 177011 2660.8
## - beds_density 1 1776.4 177411 2661.8
## - totalinc 1 3163.2 178797 2665.2
## - poverty 1 20581.5 196216 2706.1
## - pop_density 1 30494.9 206129 2727.8
##
## Step: AIC=2657.33
## crm_1000 ~ pop18 + pop65 + hsgrad + poverty + unemp + pcincome +
## totalinc + pop_density + beds_density + docs_density
##
## Df Sum of Sq RSS AIC
## - docs_density 1 5.1 175640 2655.3
## - pop65 1 175.6 175810 2655.8
## - pcincome 1 195.1 175830 2655.8
## - pop18 1 470.2 176105 2656.5
## <none> 175634 2657.3
## - hsgrad 1 868.0 176502 2657.5
## - unemp 1 1499.7 177134 2659.1
## + bagrad 1 0.3 175634 2659.3
## - beds_density 1 1932.7 177567 2660.1
## - totalinc 1 3188.6 178823 2663.2
## - poverty 1 26123.4 201758 2716.3
## - pop_density 1 30577.1 206212 2725.9
##
## Step: AIC=2655.34
## crm_1000 ~ pop18 + pop65 + hsgrad + poverty + unemp + pcincome +
## totalinc + pop_density + beds_density
##
## Df Sum of Sq RSS AIC
## - pop65 1 175.5 175815 2653.8
## - pcincome 1 204.3 175844 2653.8
## - pop18 1 472.0 176112 2654.5
## <none> 175640 2655.3
## - hsgrad 1 864.2 176504 2655.5
## - unemp 1 1495.7 177135 2657.1
## + docs_density 1 5.1 175634 2657.3
## + bagrad 1 0.0 175640 2657.3
## - totalinc 1 3183.5 178823 2661.2
## - beds_density 1 3218.9 178859 2661.3
## - poverty 1 26501.7 202141 2715.2
## - pop_density 1 30736.7 206376 2724.3
##
## Step: AIC=2653.78
## crm_1000 ~ pop18 + hsgrad + poverty + unemp + pcincome + totalinc +
## pop_density + beds_density
##
## Df Sum of Sq RSS AIC
## - pcincome 1 227.1 176042 2652.3
## <none> 175815 2653.8
## - hsgrad 1 992.1 176807 2654.3
## - pop18 1 1252.2 177067 2654.9
## + pop65 1 175.5 175640 2655.3
## + docs_density 1 5.0 175810 2655.8
## + bagrad 1 0.5 175815 2655.8
## - unemp 1 1656.9 177472 2655.9
## - beds_density 1 3119.9 178935 2659.5
## - totalinc 1 3153.5 178969 2659.6
## - poverty 1 29316.3 205131 2719.6
## - pop_density 1 30562.1 206377 2722.3
##
## Step: AIC=2652.35
## crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc + pop_density +
## beds_density
##
## Df Sum of Sq RSS AIC
## <none> 176042 2652.3
## - pop18 1 1110 177153 2653.1
## - hsgrad 1 1331 177373 2653.7
## + pcincome 1 227 175815 2653.8
## + pop65 1 198 175844 2653.8
## + bagrad 1 114 175929 2654.1
## + docs_density 1 17 176025 2654.3
## - unemp 1 1609 177651 2654.3
## - beds_density 1 3580 179622 2659.2
## - totalinc 1 4257 180299 2660.9
## - poverty 1 33494 209537 2727.0
## - pop_density 1 35476 211518 2731.1
##
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc +
## pop_density + beds_density, data = crime_df2)
##
## Coefficients:
## (Intercept) pop18 hsgrad poverty unemp
## -1.618e+01 4.235e-01 4.017e-01 2.912e+00 -1.079e+00
## totalinc pop_density beds_density
## 2.560e-04 4.477e-03 1.629e+03
mat = as.matrix(crime_df2)
# Printing the 2 best models of each size, using the Cp criterion:
leaps(x = mat[,2:12], y = mat[,1], nbest = 2, method = "Cp")
## $which
## 1 2 3 4 5 6 7 8 9 A B
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## 1 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## 3 FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
## 3 FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE
## 4 FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
## 5 FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 5 FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
## 6 FALSE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 6 TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 7 TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 7 FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 8 TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 8 TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 9 TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 9 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 10 TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 10 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##
## $label
## [1] "(Intercept)" "1" "2" "3" "4"
## [6] "5" "6" "7" "8" "9"
## [11] "A" "B"
##
## $size
## [1] 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12
##
## $Cp
## [1] 178.523611 185.054464 43.329279 121.797655 22.962369 24.812950
## [7] 14.542755 15.594180 9.242835 9.247756 5.700027 6.237706
## [13] 4.994133 5.444228 6.440778 6.510977 8.013131 8.242887
## [19] 10.000678 10.013108 12.000000
# Printing the 2 best models of each size, using the adjusted R^2 criterion:
leaps(x = mat[,2:12], y = mat[,1], nbest = 2, method = "adjr2")
## $which
## 1 2 3 4 5 6 7 8 9 A B
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## 1 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## 3 FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
## 3 FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE
## 4 FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
## 5 FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 5 FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
## 6 FALSE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 6 TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 7 TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 7 FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 8 TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 8 TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 9 TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 9 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 10 TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 10 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##
## $label
## [1] "(Intercept)" "1" "2" "3" "4"
## [6] "5" "6" "7" "8" "9"
## [11] "A" "B"
##
## $size
## [1] 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12
##
## $adjr2
## [1] 0.2290554 0.2208622 0.3998009 0.3011339 0.4266132 0.4242809 0.4384570
## [8] 0.4371289 0.4464056 0.4463994 0.4521611 0.4514787 0.4543347 0.4537622
## [15] 0.4537742 0.4536847 0.4530503 0.4527567 0.4517914 0.4517754 0.4505114
# Function regsubsets() performs a subset selection by identifying the "best" model that contains
# a certain number of predictors. By default "best" is chosen using SSE/RSS (smaller is better)
b = regsubsets(crm_1000 ~ ., data = crime_df2)
rs = summary(b)
# plot of Cp and Adj-R2 as functions of parameters
par(mfrow=c(1,2))
plot(2:9, rs$cp, xlab="No of parameters", ylab="Cp Statistic")
abline(0,1)
#6
plot(2:9, rs$adjr2, xlab="No of parameters", ylab="Adj R2")
#6
Model_backward =
lm(formula = crm_1000 ~ pop18 + hsgrad + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = crime_df)
summary(Model_backward)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + bagrad + poverty + pcincome +
## totalinc + region + pop_density + beds_density_1000, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.29 -11.31 -0.85 10.43 76.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.658e+01 2.462e+01 -2.704 0.007114 **
## pop18 8.036e-01 2.861e-01 2.808 0.005208 **
## hsgrad 5.786e-01 2.641e-01 2.191 0.028998 *
## bagrad -6.045e-01 2.831e-01 -2.135 0.033329 *
## poverty 2.061e+00 3.618e-01 5.696 2.28e-08 ***
## pcincome 1.102e-03 4.714e-04 2.338 0.019866 *
## totalinc 1.996e-04 7.622e-05 2.619 0.009136 **
## region2 9.145e+00 2.668e+00 3.428 0.000668 ***
## region3 2.710e+01 2.533e+00 10.699 < 2e-16 ***
## region4 2.127e+01 3.080e+00 6.906 1.81e-11 ***
## pop_density 4.941e-03 4.494e-04 10.995 < 2e-16 ***
## beds_density_1000 2.484e+00 5.042e-01 4.926 1.20e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.88 on 428 degrees of freedom
## Multiple R-squared: 0.5827, Adjusted R-squared: 0.572
## F-statistic: 54.33 on 11 and 428 DF, p-value: < 2.2e-16
BIC(Model_backward)
## [1] 3853.23
Model_forward =
lm(formula = crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + unemp + pcincome + totalinc + region + pop_density + beds_density_1000 + docs_density, data = crime_df)
BIC(Model_forward)
## [1] 3869.53
summary(Model_forward)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty +
## unemp + pcincome + totalinc + region + pop_density + beds_density_1000 +
## docs_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.570 -11.549 -0.911 10.417 75.670
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.862e+01 2.757e+01 -2.489 0.01319 *
## pop18 7.160e-01 3.327e-01 2.152 0.03198 *
## pop65 -1.993e-01 3.072e-01 -0.649 0.51695
## hsgrad 6.074e-01 2.706e-01 2.245 0.02529 *
## bagrad -4.964e-01 2.988e-01 -1.662 0.09733 .
## poverty 1.879e+00 3.887e-01 4.836 1.86e-06 ***
## unemp 6.015e-01 5.345e-01 1.125 0.26109
## pcincome 1.029e-03 4.846e-04 2.123 0.03432 *
## totalinc 2.072e-04 7.652e-05 2.707 0.00705 **
## region2 9.098e+00 2.747e+00 3.312 0.00101 **
## region3 2.785e+01 2.673e+00 10.419 < 2e-16 ***
## region4 2.160e+01 3.140e+00 6.879 2.16e-11 ***
## pop_density 5.006e-03 4.536e-04 11.037 < 2e-16 ***
## beds_density_1000 3.138e+00 7.988e-01 3.928 0.00010 ***
## docs_density -6.554e+02 1.025e+03 -0.639 0.52298
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.9 on 425 degrees of freedom
## Multiple R-squared: 0.5845, Adjusted R-squared: 0.5708
## F-statistic: 42.71 on 14 and 425 DF, p-value: < 2.2e-16
Model_stepwise =
lm(formula = crm_1000 ~ pop18 + hsgrad + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = crime_df)
summary(Model_stepwise)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + bagrad + poverty + pcincome +
## totalinc + region + pop_density + beds_density_1000, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.29 -11.31 -0.85 10.43 76.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.658e+01 2.462e+01 -2.704 0.007114 **
## pop18 8.036e-01 2.861e-01 2.808 0.005208 **
## hsgrad 5.786e-01 2.641e-01 2.191 0.028998 *
## bagrad -6.045e-01 2.831e-01 -2.135 0.033329 *
## poverty 2.061e+00 3.618e-01 5.696 2.28e-08 ***
## pcincome 1.102e-03 4.714e-04 2.338 0.019866 *
## totalinc 1.996e-04 7.622e-05 2.619 0.009136 **
## region2 9.145e+00 2.668e+00 3.428 0.000668 ***
## region3 2.710e+01 2.533e+00 10.699 < 2e-16 ***
## region4 2.127e+01 3.080e+00 6.906 1.81e-11 ***
## pop_density 4.941e-03 4.494e-04 10.995 < 2e-16 ***
## beds_density_1000 2.484e+00 5.042e-01 4.926 1.20e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.88 on 428 degrees of freedom
## Multiple R-squared: 0.5827, Adjusted R-squared: 0.572
## F-statistic: 54.33 on 11 and 428 DF, p-value: < 2.2e-16
BIC(Model_stepwise)
## [1] 3853.23
cp1 =
lm(formula = crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc + pop_density + docs_density, data = crime_df2)
BIC(cp1)
## [1] 3943.759
summary(cp1)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc +
## pop_density + docs_density, data = crime_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.162 -12.313 -1.754 13.102 69.026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.927e+00 2.003e+01 -0.196 0.84467
## pop18 3.158e-01 2.575e-01 1.227 0.22066
## hsgrad 3.165e-01 2.253e-01 1.405 0.16083
## poverty 3.097e+00 3.099e-01 9.993 < 2e-16 ***
## unemp -1.280e+00 5.357e-01 -2.390 0.01726 *
## totalinc 2.283e-04 8.001e-05 2.853 0.00454 **
## pop_density 4.437e-03 4.928e-04 9.005 < 2e-16 ***
## docs_density 1.577e+03 7.173e+02 2.198 0.02844 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.28 on 432 degrees of freedom
## Multiple R-squared: 0.4582, Adjusted R-squared: 0.4494
## F-statistic: 52.19 on 7 and 432 DF, p-value: < 2.2e-16
cp2 =
lm(formula = crm_1000 ~ pop65 + hsgrad + poverty + unemp + totalinc + pop_density + docs_density, data = crime_df2)
BIC(cp2)
## [1] 3944.684
summary(cp2)
##
## Call:
## lm(formula = crm_1000 ~ pop65 + hsgrad + poverty + unemp + totalinc +
## pop_density + docs_density, data = crime_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.172 -12.499 -1.668 12.860 69.984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.666e+00 2.177e+01 0.260 0.79474
## pop65 -2.050e-01 2.660e-01 -0.771 0.44132
## hsgrad 3.365e-01 2.284e-01 1.473 0.14139
## poverty 3.132e+00 3.117e-01 10.048 < 2e-16 ***
## unemp -1.327e+00 5.350e-01 -2.480 0.01353 *
## totalinc 2.274e-04 8.016e-05 2.837 0.00477 **
## pop_density 4.489e-03 4.911e-04 9.140 < 2e-16 ***
## docs_density 1.732e+03 7.253e+02 2.388 0.01737 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.3 on 432 degrees of freedom
## Multiple R-squared: 0.457, Adjusted R-squared: 0.4482
## F-statistic: 51.95 on 7 and 432 DF, p-value: < 2.2e-16
#linear regression model with full predictors
fit_no_inte = lm(crm_1000 ~ pop18 +totalinc + poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density, data = crime_df)
summary(fit_no_inte)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad +
## beds_density_1000 + docs_density + region + pop_density,
## data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.536 -11.453 -0.606 10.089 75.406
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.471e+01 1.601e+01 -1.543 0.123503
## pop18 4.176e-01 2.331e-01 1.792 0.073901 .
## totalinc 2.473e-04 7.250e-05 3.411 0.000708 ***
## poverty 1.607e+00 3.141e-01 5.116 4.71e-07 ***
## hsgrad 3.248e-01 2.025e-01 1.604 0.109452
## beds_density_1000 2.921e+00 7.242e-01 4.034 6.48e-05 ***
## docs_density -5.132e+02 9.134e+02 -0.562 0.574470
## region2 9.001e+00 2.625e+00 3.429 0.000664 ***
## region3 2.586e+01 2.496e+00 10.364 < 2e-16 ***
## region4 2.071e+01 3.074e+00 6.736 5.24e-11 ***
## pop_density 5.145e-03 4.424e-04 11.629 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.97 on 429 degrees of freedom
## Multiple R-squared: 0.5772, Adjusted R-squared: 0.5674
## F-statistic: 58.57 on 10 and 429 DF, p-value: < 2.2e-16
fit_no = lm(crm_1000 ~ pop18 +totalinc + poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density + pcincome, data = crime_df)
small = lm(crm_1000 ~ pop18 +totalinc + poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density , data = crime_df)
anova(small, fit_no)
## Analysis of Variance Table
##
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density + pcincome
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 429 138602
## 2 428 137872 1 729.97 2.2661 0.133
Next using partial ANOVA to test whether large model is superior
no pop_density
fit_1 = lm(crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + docs_density + region, data = crime_df)
summary(fit_1)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad +
## beds_density_1000 + docs_density + region, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73.758 -10.480 -0.863 9.935 217.191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.430e+01 1.831e+01 -0.781 0.4352
## pop18 6.297e-01 2.662e-01 2.366 0.0184 *
## totalinc 4.760e-04 7.994e-05 5.955 5.42e-09 ***
## poverty 1.844e+00 3.590e-01 5.137 4.25e-07 ***
## hsgrad 1.173e-01 2.311e-01 0.508 0.6119
## beds_density_1000 2.541e+00 8.287e-01 3.067 0.0023 **
## docs_density 1.688e+03 1.024e+03 1.649 0.0998 .
## region2 6.283e+00 2.995e+00 2.098 0.0365 *
## region3 2.172e+01 2.829e+00 7.676 1.12e-13 ***
## region4 1.583e+01 3.489e+00 4.538 7.38e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.59 on 430 degrees of freedom
## Multiple R-squared: 0.444, Adjusted R-squared: 0.4323
## F-statistic: 38.15 on 9 and 430 DF, p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_1, fit_no_inte)
## Analysis of Variance Table
##
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 430 182292
## 2 429 138602 1 43690 135.23 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Reject H0, large model is superior
fit_2 = lm(crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + docs_density + pop_density, data = crime_df)
summary(fit_2)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad +
## beds_density_1000 + docs_density + pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.71 -12.33 -2.50 12.46 68.43
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.758e+01 1.746e+01 -2.153 0.03190 *
## pop18 5.140e-01 2.572e-01 1.998 0.04631 *
## totalinc 2.523e-04 8.098e-05 3.115 0.00196 **
## poverty 2.798e+00 3.176e-01 8.809 < 2e-16 ***
## hsgrad 5.517e-01 2.137e-01 2.582 0.01015 *
## beds_density_1000 1.805e+00 7.544e-01 2.393 0.01713 *
## docs_density 2.276e+02 9.981e+02 0.228 0.81972
## pop_density 4.433e-03 4.927e-04 8.997 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.28 on 432 degrees of freedom
## Multiple R-squared: 0.4582, Adjusted R-squared: 0.4494
## F-statistic: 52.19 on 7 and 432 DF, p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_2, fit_no_inte)
## Analysis of Variance Table
##
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + pop_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 432 177630
## 2 429 138602 3 39028 40.266 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Reject H0, large model is superior
no docs_density
fit_3 = lm(crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + region + pop_density, data = crime_df)
summary(fit_3)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad +
## beds_density_1000 + region + pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.426 -11.302 -0.614 10.041 75.789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.218e+01 1.535e+01 -1.445 0.149295
## pop18 3.911e-01 2.281e-01 1.715 0.087128 .
## totalinc 2.406e-04 7.146e-05 3.367 0.000827 ***
## poverty 1.624e+00 3.124e-01 5.197 3.13e-07 ***
## hsgrad 3.007e-01 1.977e-01 1.521 0.129104
## beds_density_1000 2.626e+00 4.976e-01 5.277 2.08e-07 ***
## region2 9.246e+00 2.587e+00 3.574 0.000391 ***
## region3 2.588e+01 2.493e+00 10.378 < 2e-16 ***
## region4 2.056e+01 3.061e+00 6.718 5.85e-11 ***
## pop_density 5.094e-03 4.325e-04 11.777 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.96 on 430 degrees of freedom
## Multiple R-squared: 0.5769, Adjusted R-squared: 0.5681
## F-statistic: 65.15 on 9 and 430 DF, p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_3, fit_no_inte)
## Analysis of Variance Table
##
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## region + pop_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 430 138704
## 2 429 138602 1 102.01 0.3157 0.5745
Fail to reject H0, large model is not superior, we should keep the smaller one.
no beds_density
fit_4 = lm(crm_1000 ~ pop18 + totalinc + poverty + hsgrad + region + pop_density + docs_density, data = crime_df)
summary(fit_4)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad +
## region + pop_density + docs_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.950 -11.572 -0.876 10.949 79.338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.410e+01 1.607e+01 -0.877 0.380758
## pop18 2.357e-01 2.327e-01 1.013 0.311786
## totalinc 2.153e-04 7.333e-05 2.936 0.003508 **
## poverty 2.010e+00 3.031e-01 6.631 1.00e-10 ***
## hsgrad 2.801e-01 2.058e-01 1.361 0.174175
## region2 1.074e+01 2.635e+00 4.078 5.42e-05 ***
## region3 2.566e+01 2.539e+00 10.106 < 2e-16 ***
## region4 1.784e+01 3.044e+00 5.862 9.10e-09 ***
## pop_density 5.065e-03 4.498e-04 11.260 < 2e-16 ***
## docs_density 2.162e+03 6.392e+02 3.382 0.000785 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.29 on 430 degrees of freedom
## Multiple R-squared: 0.5612, Adjusted R-squared: 0.552
## F-statistic: 61.1 on 9 and 430 DF, p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_4, fit_no_inte)
## Analysis of Variance Table
##
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + region + pop_density +
## docs_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 430 143860
## 2 429 138602 1 5258.4 16.276 6.483e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Reject H0, large model is superior
no hsgrad
fit_5 = lm(crm_1000 ~ pop18 + totalinc + poverty + region + pop_density + docs_density + beds_density_1000, data = crime_df)
summary(fit_5)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + region +
## pop_density + docs_density + beds_density_1000, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.333 -11.471 -0.716 10.299 73.846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.209e+00 6.470e+00 -0.187 0.851804
## pop18 5.475e-01 2.190e-01 2.501 0.012769 *
## totalinc 2.403e-04 7.250e-05 3.314 0.000998 ***
## poverty 1.262e+00 2.292e-01 5.505 6.37e-08 ***
## region2 1.001e+01 2.553e+00 3.922 0.000102 ***
## region3 2.631e+01 2.484e+00 10.591 < 2e-16 ***
## region4 2.227e+01 2.922e+00 7.619 1.64e-13 ***
## pop_density 5.083e-03 4.415e-04 11.511 < 2e-16 ***
## docs_density -2.024e+02 8.942e+02 -0.226 0.821077
## beds_density_1000 2.858e+00 7.244e-01 3.945 9.31e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.01 on 430 degrees of freedom
## Multiple R-squared: 0.5747, Adjusted R-squared: 0.5658
## F-statistic: 64.56 on 9 and 430 DF, p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_5, fit_no_inte)
## Analysis of Variance Table
##
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + region + pop_density +
## docs_density + beds_density_1000
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 430 139433
## 2 429 138602 1 831.22 2.5728 0.1095
Fail to reject H0, large model is not superior, we should keep the smaller one.
no poverty
fit_6 = lm(crm_1000 ~ pop18 + totalinc + region + pop_density + docs_density + beds_density_1000 + hsgrad, data = crime_df)
summary(fit_6)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + region + pop_density +
## docs_density + beds_density_1000 + hsgrad, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.197 -11.670 -0.842 10.300 76.758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.777e+01 1.265e+01 2.195 0.028675 *
## pop18 7.721e-01 2.290e-01 3.372 0.000814 ***
## totalinc 2.260e-04 7.447e-05 3.035 0.002551 **
## region2 1.154e+01 2.652e+00 4.349 1.71e-05 ***
## region3 2.973e+01 2.447e+00 12.147 < 2e-16 ***
## region4 2.749e+01 2.854e+00 9.631 < 2e-16 ***
## pop_density 5.292e-03 4.543e-04 11.650 < 2e-16 ***
## docs_density -9.572e+02 9.355e+02 -1.023 0.306767
## beds_density_1000 4.099e+00 7.064e-01 5.802 1.27e-08 ***
## hsgrad -3.849e-01 1.518e-01 -2.536 0.011564 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.49 on 430 degrees of freedom
## Multiple R-squared: 0.5514, Adjusted R-squared: 0.5421
## F-statistic: 58.74 on 9 and 430 DF, p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_6, fit_no_inte)
## Analysis of Variance Table
##
## Model 1: crm_1000 ~ pop18 + totalinc + region + pop_density + docs_density +
## beds_density_1000 + hsgrad
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 430 147059
## 2 429 138602 1 8456.7 26.175 4.712e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Reject H0, large model is superior
no totalinc
fit_7 = lm(crm_1000 ~ pop18 + poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density, data = crime_df)
summary(fit_7)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.252 -11.245 -0.797 9.991 75.301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.002e+01 1.615e+01 -1.240 0.215822
## pop18 4.302e-01 2.359e-01 1.824 0.068902 .
## poverty 1.545e+00 3.174e-01 4.868 1.58e-06 ***
## hsgrad 2.829e-01 2.046e-01 1.382 0.167557
## beds_density_1000 2.651e+00 7.286e-01 3.638 0.000308 ***
## docs_density -2.649e+00 9.121e+02 -0.003 0.997684
## region2 9.215e+00 2.657e+00 3.469 0.000576 ***
## region3 2.576e+01 2.526e+00 10.198 < 2e-16 ***
## region4 2.189e+01 3.092e+00 7.077 6.01e-12 ***
## pop_density 5.555e-03 4.311e-04 12.884 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.2 on 430 degrees of freedom
## Multiple R-squared: 0.5658, Adjusted R-squared: 0.5567
## F-statistic: 62.25 on 9 and 430 DF, p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_7, fit_no_inte)
## Analysis of Variance Table
##
## Model 1: crm_1000 ~ pop18 + poverty + hsgrad + beds_density_1000 + docs_density +
## region + pop_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 430 142362
## 2 429 138602 1 3759.8 11.637 0.0007077 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Reject H0, large model is superior.
no pop18
fit_8 = lm(crm_1000 ~ poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density + totalinc, data = crime_df)
summary(fit_7)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.252 -11.245 -0.797 9.991 75.301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.002e+01 1.615e+01 -1.240 0.215822
## pop18 4.302e-01 2.359e-01 1.824 0.068902 .
## poverty 1.545e+00 3.174e-01 4.868 1.58e-06 ***
## hsgrad 2.829e-01 2.046e-01 1.382 0.167557
## beds_density_1000 2.651e+00 7.286e-01 3.638 0.000308 ***
## docs_density -2.649e+00 9.121e+02 -0.003 0.997684
## region2 9.215e+00 2.657e+00 3.469 0.000576 ***
## region3 2.576e+01 2.526e+00 10.198 < 2e-16 ***
## region4 2.189e+01 3.092e+00 7.077 6.01e-12 ***
## pop_density 5.555e-03 4.311e-04 12.884 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.2 on 430 degrees of freedom
## Multiple R-squared: 0.5658, Adjusted R-squared: 0.5567
## F-statistic: 62.25 on 9 and 430 DF, p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_8, fit_no_inte)
## Analysis of Variance Table
##
## Model 1: crm_1000 ~ poverty + hsgrad + beds_density_1000 + docs_density +
## region + pop_density + totalinc
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 +
## docs_density + region + pop_density
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 430 139639
## 2 429 138602 1 1037 3.2099 0.0739 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Fail to reject H0, large model is not superior, we should keep the smaller one.
fiti <- lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = crime_df)
summary(fiti)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + beds_density_1000 +
## region + pop_density + pop18 * pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.315 -11.345 -0.571 10.215 73.941
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.201e+01 6.699e+00 -1.792 0.07383 .
## pop18 8.962e-01 2.213e-01 4.050 6.08e-05 ***
## totalinc 2.018e-04 7.064e-05 2.856 0.00449 **
## poverty 1.208e+00 2.119e-01 5.700 2.23e-08 ***
## beds_density_1000 2.982e+00 4.866e-01 6.129 2.01e-09 ***
## region2 1.001e+01 2.482e+00 4.032 6.55e-05 ***
## region3 2.725e+01 2.443e+00 11.155 < 2e-16 ***
## region4 2.350e+01 2.841e+00 8.273 1.65e-15 ***
## pop_density 1.984e-02 3.469e-03 5.721 1.99e-08 ***
## pop18:pop_density -4.872e-04 1.135e-04 -4.293 2.18e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.63 on 430 degrees of freedom
## Multiple R-squared: 0.5921, Adjusted R-squared: 0.5836
## F-statistic: 69.36 on 9 and 430 DF, p-value: < 2.2e-16
interact_plot(fiti, pred = pop18, modx = pop_density)
## Warning: -1306.28434985964 is outside the observed range of pop_density
fiti <- lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome +
totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = crime_df)
summary(fiti)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + bagrad + poverty + pcincome +
## totalinc + region + pop_density + beds_density_1000 + bagrad *
## pcincome + bagrad * pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.147 -10.177 -0.915 9.618 60.267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.990e+01 1.647e+01 -5.457 8.23e-08 ***
## pop18 7.242e-01 2.751e-01 2.633 0.008778 **
## bagrad 1.867e+00 5.021e-01 3.718 0.000228 ***
## poverty 2.181e+00 2.906e-01 7.503 3.65e-13 ***
## pcincome 4.350e-03 7.999e-04 5.439 9.06e-08 ***
## totalinc 8.373e-05 7.336e-05 1.141 0.254364
## region2 1.216e+01 2.420e+00 5.024 7.46e-07 ***
## region3 2.912e+01 2.418e+00 12.044 < 2e-16 ***
## region4 2.447e+01 2.834e+00 8.634 < 2e-16 ***
## pop_density 9.032e-03 1.107e-03 8.161 3.76e-15 ***
## beds_density_1000 1.940e+00 4.973e-01 3.901 0.000111 ***
## bagrad:pcincome -1.006e-04 2.348e-05 -4.283 2.28e-05 ***
## bagrad:pop_density -1.960e-04 4.872e-05 -4.023 6.80e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.91 on 427 degrees of freedom
## Multiple R-squared: 0.6276, Adjusted R-squared: 0.6171
## F-statistic: 59.96 on 12 and 427 DF, p-value: < 2.2e-16
interact_plot(fiti, pred = bagrad, modx = pcincome)
interact_plot(fiti, pred = bagrad, modx = pop_density)
## Warning: -1306.28434985964 is outside the observed range of pop_density
Thus the four candidate regression model should be:
model_1 = lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = crime_df)
model_2 = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = crime_df)
model_3 = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = crime_df)
Model1
bc <- boxcox(model_1, lambda = seq(-5, 5, by = 0.25))
lambda <- bc$x[which.max(bc$y)]
#choose power 1/2
crime_df$crm.bc <- crime_df$crm_1000^(1/2)
model_bc1 = lm(crm.bc ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = crime_df)
par(mfrow = c(1,2))
plot(model_1, which = 2)
plot(model_bc1, which = 2)
summary(model_1)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + beds_density_1000 +
## region + pop_density + pop18 * pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.315 -11.345 -0.571 10.215 73.941
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.201e+01 6.699e+00 -1.792 0.07383 .
## pop18 8.962e-01 2.213e-01 4.050 6.08e-05 ***
## totalinc 2.018e-04 7.064e-05 2.856 0.00449 **
## poverty 1.208e+00 2.119e-01 5.700 2.23e-08 ***
## beds_density_1000 2.982e+00 4.866e-01 6.129 2.01e-09 ***
## region2 1.001e+01 2.482e+00 4.032 6.55e-05 ***
## region3 2.725e+01 2.443e+00 11.155 < 2e-16 ***
## region4 2.350e+01 2.841e+00 8.273 1.65e-15 ***
## pop_density 1.984e-02 3.469e-03 5.721 1.99e-08 ***
## pop18:pop_density -4.872e-04 1.135e-04 -4.293 2.18e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.63 on 430 degrees of freedom
## Multiple R-squared: 0.5921, Adjusted R-squared: 0.5836
## F-statistic: 69.36 on 9 and 430 DF, p-value: < 2.2e-16
summary(model_bc1)
##
## Call:
## lm(formula = crm.bc ~ pop18 + totalinc + poverty + beds_density_1000 +
## region + pop_density + pop18 * pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1476 -0.6532 0.0468 0.7784 4.4936
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.944e+00 4.482e-01 6.570 1.46e-10 ***
## pop18 5.472e-02 1.480e-02 3.696 0.000247 ***
## totalinc 1.683e-05 4.726e-06 3.562 0.000410 ***
## poverty 7.121e-02 1.418e-02 5.022 7.49e-07 ***
## beds_density_1000 2.076e-01 3.255e-02 6.377 4.67e-10 ***
## region2 7.061e-01 1.660e-01 4.253 2.59e-05 ***
## region3 1.898e+00 1.634e-01 11.614 < 2e-16 ***
## region4 1.727e+00 1.900e-01 9.085 < 2e-16 ***
## pop_density 8.191e-04 2.320e-04 3.530 0.000460 ***
## pop18:pop_density -1.923e-05 7.592e-06 -2.534 0.011645 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.18 on 430 degrees of freedom
## Multiple R-squared: 0.5486, Adjusted R-squared: 0.5391
## F-statistic: 58.06 on 9 and 430 DF, p-value: < 2.2e-16
Model2
bc <- boxcox(model_2, lambda = seq(-5, 5, by = 0.25))
lambda <- bc$x[which.max(bc$y)]
#choose power 1/2
crime_df$crm.bc <- crime_df$crm_1000^(1/2)
model_bc2 = lm(crm.bc ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = crime_df)
par(mfrow = c(1,2))
plot(model_2, which = 2)
plot(model_bc2, which = 2)
summary(model_2)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + bagrad + poverty + pcincome +
## totalinc + region + pop_density + beds_density_1000, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.532 -11.110 -0.727 9.967 79.427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.959e+01 1.214e+01 -1.614 0.10728
## pop18 7.508e-01 2.864e-01 2.622 0.00906 **
## bagrad -2.184e-01 2.226e-01 -0.981 0.32709
## poverty 1.569e+00 2.849e-01 5.507 6.30e-08 ***
## pcincome 8.141e-04 4.547e-04 1.790 0.07413 .
## totalinc 1.886e-04 7.639e-05 2.469 0.01394 *
## region2 1.085e+01 2.563e+00 4.234 2.81e-05 ***
## region3 2.704e+01 2.544e+00 10.629 < 2e-16 ***
## region4 2.308e+01 2.980e+00 7.743 7.05e-14 ***
## pop_density 4.843e-03 4.491e-04 10.784 < 2e-16 ***
## beds_density_1000 2.577e+00 5.046e-01 5.108 4.92e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.96 on 429 degrees of freedom
## Multiple R-squared: 0.578, Adjusted R-squared: 0.5682
## F-statistic: 58.76 on 10 and 429 DF, p-value: < 2.2e-16
summary(model_bc2)
##
## Call:
## lm(formula = crm.bc ~ pop18 + bagrad + poverty + pcincome + totalinc +
## region + pop_density + beds_density_1000, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1813 -0.6894 0.0230 0.7638 4.0506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.465e+00 7.963e-01 1.840 0.06653 .
## pop18 6.222e-02 1.879e-02 3.311 0.00101 **
## bagrad -2.174e-02 1.460e-02 -1.489 0.13735
## poverty 1.043e-01 1.869e-02 5.579 4.30e-08 ***
## pcincome 8.347e-05 2.984e-05 2.797 0.00539 **
## totalinc 1.317e-05 5.013e-06 2.627 0.00893 **
## region2 7.888e-01 1.682e-01 4.690 3.68e-06 ***
## region3 1.935e+00 1.669e-01 11.589 < 2e-16 ***
## region4 1.766e+00 1.956e-01 9.029 < 2e-16 ***
## pop_density 2.128e-04 2.947e-05 7.222 2.35e-12 ***
## beds_density_1000 1.811e-01 3.311e-02 5.470 7.68e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.178 on 429 degrees of freedom
## Multiple R-squared: 0.5506, Adjusted R-squared: 0.5402
## F-statistic: 52.57 on 10 and 429 DF, p-value: < 2.2e-16
Model3
bc <- boxcox(model_3, lambda = seq(-5, 5, by = 0.25))
lambda <- bc$x[which.max(bc$y)]
#choose power 1/2
crime_df$crm.bc <- crime_df$crm_1000^(1/2)
model_bc3 = lm(crm.bc ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = crime_df)
par(mfrow = c(1,2))
plot(model_3, which = 2)
plot(model_bc3, which = 2)
summary(model_3)
##
## Call:
## lm(formula = crm_1000 ~ pop18 + bagrad + poverty + pcincome +
## totalinc + region + pop_density + beds_density_1000 + bagrad *
## pcincome + bagrad * pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.147 -10.177 -0.915 9.618 60.267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.990e+01 1.647e+01 -5.457 8.23e-08 ***
## pop18 7.242e-01 2.751e-01 2.633 0.008778 **
## bagrad 1.867e+00 5.021e-01 3.718 0.000228 ***
## poverty 2.181e+00 2.906e-01 7.503 3.65e-13 ***
## pcincome 4.350e-03 7.999e-04 5.439 9.06e-08 ***
## totalinc 8.373e-05 7.336e-05 1.141 0.254364
## region2 1.216e+01 2.420e+00 5.024 7.46e-07 ***
## region3 2.912e+01 2.418e+00 12.044 < 2e-16 ***
## region4 2.447e+01 2.834e+00 8.634 < 2e-16 ***
## pop_density 9.032e-03 1.107e-03 8.161 3.76e-15 ***
## beds_density_1000 1.940e+00 4.973e-01 3.901 0.000111 ***
## bagrad:pcincome -1.006e-04 2.348e-05 -4.283 2.28e-05 ***
## bagrad:pop_density -1.960e-04 4.872e-05 -4.023 6.80e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.91 on 427 degrees of freedom
## Multiple R-squared: 0.6276, Adjusted R-squared: 0.6171
## F-statistic: 59.96 on 12 and 427 DF, p-value: < 2.2e-16
summary(model_bc3)
##
## Call:
## lm(formula = crm.bc ~ pop18 + bagrad + poverty + pcincome + totalinc +
## region + pop_density + beds_density_1000 + bagrad * pcincome +
## bagrad * pop_density, data = crime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0257 -0.6925 0.0322 0.6745 3.5136
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.173e+00 1.098e+00 -2.889 0.004062 **
## pop18 5.539e-02 1.834e-02 3.021 0.002672 **
## bagrad 1.301e-01 3.347e-02 3.887 0.000118 ***
## poverty 1.459e-01 1.937e-02 7.533 2.99e-13 ***
## pcincome 3.282e-04 5.332e-05 6.156 1.72e-09 ***
## totalinc 6.965e-06 4.890e-06 1.424 0.155068
## region2 8.582e-01 1.613e-01 5.320 1.68e-07 ***
## region3 2.033e+00 1.612e-01 12.612 < 2e-16 ***
## region4 1.809e+00 1.889e-01 9.573 < 2e-16 ***
## pop_density 3.613e-04 7.377e-05 4.898 1.38e-06 ***
## beds_density_1000 1.338e-01 3.315e-02 4.036 6.45e-05 ***
## bagrad:pcincome -7.504e-06 1.565e-06 -4.794 2.26e-06 ***
## bagrad:pop_density -6.847e-06 3.248e-06 -2.108 0.035588 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.127 on 427 degrees of freedom
## Multiple R-squared: 0.5907, Adjusted R-squared: 0.5792
## F-statistic: 51.36 on 12 and 427 DF, p-value: < 2.2e-16
plot(model_1, which = 1)
plot(model_2, which = 1)
plot(model_3, which = 1)
observation #6, #215
Filter out influential observations
plot(model_1, which = 2)
plot(model_2, which = 2)
plot(model_3, which = 2)
The observation #6, #215, #123is an outlier in all three
plot(model_1, which = 3)
plot(model_2, which = 3)
plot(model_3, which = 3)
The observation #123, #6, #215 is identified
plot(model_1, which = 5)
plot(model_2, which = 5)
plot(model_3, which = 5)
observation #1, #6
plot(model_1, which = 6)
plot(model_2, which = 6)
plot(model_3, which = 6)
#1, #123, #6
The observation #6 and #1, is beyond the Cook’s distance lines of 0.5 (> 1). The plot identified the influential observation as #6, #1
crime_df %>%
filter(row(crime_df) == 6 | row(crime_df) == 215 )
## # A tibble: 2 × 16
## crm_1000 cty state pop18 pop65 hsgrad bagrad poverty unemp pcincome totalinc
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 296. Kings NY 28.3 12.4 63.7 16.6 19.5 9.5 16803 38658
## 2 112. Atla… NJ 29 14.5 72.9 16.4 6.4 8.3 24035 5392
## # … with 5 more variables: region <fct>, pop_density <dbl>,
## # beds_density_1000 <dbl>, docs_density <dbl>, crm.bc <dbl>
crime_after <- crime_df %>%
filter(row(crime_df) != 6)
# Correlation matrix for all variables
temp <- crime_after %>%
dplyr::select(-state, -cty, -region)
cor(temp)
## crm_1000 pop18 pop65 hsgrad bagrad
## crm_1000 1.00000000 0.21111276 -0.07448047 -0.20671429 0.05501983
## pop18 0.21111276 1.00000000 -0.61630643 0.25141951 0.45619208
## pop65 -0.07448047 -0.61630643 1.00000000 -0.26919500 -0.33928575
## hsgrad -0.20671429 0.25141951 -0.26919500 1.00000000 0.70858677
## bagrad 0.05501983 0.45619208 -0.33928575 0.70858677 1.00000000
## poverty 0.47132275 0.03452596 0.00631249 -0.68859004 -0.40799186
## unemp 0.01882935 -0.27883812 0.23656367 -0.59167432 -0.54041041
## pcincome -0.07881273 -0.03171871 0.01865180 0.52349172 0.69520378
## totalinc 0.19993547 0.07198201 -0.02319956 0.05473613 0.22699684
## pop_density 0.29355413 0.17535045 0.03751855 -0.05416937 0.24037288
## beds_density_1000 0.39845317 0.02954235 0.24713716 -0.21157574 -0.04527816
## docs_density 0.33806678 0.23702815 0.01860966 0.14338706 0.44121009
## crm.bc 0.98705951 0.20599865 -0.07654989 -0.19181575 0.05951478
## poverty unemp pcincome totalinc pop_density
## crm_1000 0.47132275 0.01882935 -0.07881273 0.199935465 0.29355413
## pop18 0.03452596 -0.27883812 -0.03171871 0.071982012 0.17535045
## pop65 0.00631249 0.23656367 0.01865180 -0.023199555 0.03751855
## hsgrad -0.68859004 -0.59167432 0.52349172 0.054736134 -0.05416937
## bagrad -0.40799186 -0.54041041 0.69520378 0.226996843 0.24037288
## poverty 1.00000000 0.43380542 -0.60326536 -0.052025537 0.07001134
## unemp 0.43380542 1.00000000 -0.32155149 -0.040991522 -0.02478102
## pcincome -0.60326536 -0.32155149 1.00000000 0.352424960 0.34018837
## totalinc -0.05202554 -0.04099152 0.35242496 1.000000000 0.32911874
## pop_density 0.07001134 -0.02478102 0.34018837 0.329118735 1.00000000
## beds_density_1000 0.37306692 -0.06293606 -0.05344502 0.005714199 0.27840087
## docs_density 0.06413325 -0.24830541 0.36011639 0.200450879 0.43747995
## crm.bc 0.45355488 0.01411476 -0.07597748 0.198626376 0.27465972
## beds_density_1000 docs_density crm.bc
## crm_1000 0.398453170 0.33806678 0.98705951
## pop18 0.029542349 0.23702815 0.20599865
## pop65 0.247137160 0.01860966 -0.07654989
## hsgrad -0.211575741 0.14338706 -0.19181575
## bagrad -0.045278162 0.44121009 0.05951478
## poverty 0.373066922 0.06413325 0.45355488
## unemp -0.062936062 -0.24830541 0.01411476
## pcincome -0.053445023 0.36011639 -0.07597748
## totalinc 0.005714199 0.20045088 0.19862638
## pop_density 0.278400865 0.43747995 0.27465972
## beds_density_1000 1.000000000 0.66670719 0.36861627
## docs_density 0.666707186 1.00000000 0.31512464
## crm.bc 0.368616267 0.31512464 1.00000000
crime_after %>%
dplyr::select(-crm_1000) %>%
ggcorr(label = TRUE, label_size = 2, hjust = 0.8)
## Warning in ggcorr(., label = TRUE, label_size = 2, hjust = 0.8): data in
## column(s) 'cty', 'state', 'region' are not numeric and were ignored
The correlation plot suggest high correlation between beds and docs, beds and totalinc, docs and totalinc.
Let’s check if the model violates colienarity
Calculate VIF w/o interaction terms
model_1_new = lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density, data = crime_after)
model_2_new = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome +
totalinc + region + pop_density + beds_density_1000, data = crime_after)
model_3_new = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome +
totalinc + region + pop_density + beds_density_1000, data = crime_after)
check_collinearity(model_1_new)
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF Increased SE Tolerance
## pop18 1.04 1.02 0.96
## totalinc 1.00 1.00 1.00
## poverty 1.36 1.17 0.74
## beds_density_1000 1.36 1.17 0.73
## region 1.31 1.14 0.77
## pop_density 1.11 1.05 0.90
check_collinearity(model_2_new)
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF Increased SE Tolerance
## pop18 1.97 1.40 0.51
## bagrad 2.12 1.46 0.47
## poverty 1.35 1.16 0.74
## pcincome 1.10 1.05 0.91
## totalinc 1.00 1.00 1.00
## region 1.34 1.16 0.74
## pop_density 1.05 1.03 0.95
## beds_density_1000 1.32 1.15 0.76
check_collinearity(model_3_new)
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF Increased SE Tolerance
## pop18 1.97 1.40 0.51
## bagrad 2.12 1.46 0.47
## poverty 1.35 1.16 0.74
## pcincome 1.10 1.05 0.91
## totalinc 1.00 1.00 1.00
## region 1.34 1.16 0.74
## pop_density 1.05 1.03 0.95
## beds_density_1000 1.32 1.15 0.76
There is no multicolinearity in our models
model_1 = lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = crime_after)
model_2 = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome +
totalinc + region + pop_density + beds_density_1000, data = crime_after)
model_3 = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome +
totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = crime_after)
model_1 %>% broom::tidy()
## # A tibble: 10 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -8.06 6.62 -1.22 2.24e- 1
## 2 pop18 0.781 0.218 3.58 3.90e- 4
## 3 totalinc 0.000282 0.0000715 3.94 9.64e- 5
## 4 poverty 1.14 0.208 5.47 7.65e- 8
## 5 beds_density_1000 3.40 0.486 7.00 1.01e-11
## 6 region2 9.62 2.43 3.96 8.92e- 5
## 7 region3 26.7 2.40 11.1 1.77e-25
## 8 region4 22.7 2.79 8.16 3.88e-15
## 9 pop_density 0.00763 0.00439 1.74 8.30e- 2
## 10 pop18:pop_density -0.000157 0.000134 -1.17 2.44e- 1
model_2 %>% broom::tidy()
## # A tibble: 11 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -43.8 12.0 -3.64 3.03e- 4
## 2 pop18 1.12 0.277 4.06 5.81e- 5
## 3 bagrad -0.406 0.213 -1.91 5.71e- 2
## 4 poverty 1.74 0.271 6.41 3.85e-10
## 5 pcincome 0.00165 0.000447 3.69 2.55e- 4
## 6 totalinc 0.000211 0.0000725 2.91 3.80e- 3
## 7 region2 11.1 2.43 4.56 6.55e- 6
## 8 region3 27.8 2.41 11.5 6.09e-27
## 9 region4 24.1 2.83 8.52 2.77e-16
## 10 pop_density 0.00164 0.000625 2.62 9.09e- 3
## 11 beds_density_1000 3.19 0.486 6.55 1.61e-10
model_3 %>% broom::tidy()
## # A tibble: 13 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -110. 16.4 -6.73 5.65e-11
## 2 pop18 0.919 0.269 3.42 6.86e- 4
## 3 bagrad 2.05 0.487 4.21 3.07e- 5
## 4 poverty 2.36 0.283 8.32 1.21e-15
## 5 pcincome 0.00535 0.000796 6.72 5.87e-11
## 6 totalinc 0.000126 0.0000715 1.76 7.96e- 2
## 7 region2 11.9 2.34 5.07 5.89e- 7
## 8 region3 28.7 2.34 12.3 8.86e-30
## 9 region4 24.0 2.75 8.75 5.17e-17
## 10 pop_density 0.00255 0.00161 1.59 1.13e- 1
## 11 beds_density_1000 2.35 0.488 4.83 1.94e- 6
## 12 bagrad:pcincome -0.000123 0.0000231 -5.31 1.77e- 7
## 13 bagrad:pop_density -0.0000245 0.0000568 -0.431 6.67e- 1
set.seed(7)
cv_df =
crossv_mc(crime_df, 100) %>%
mutate(
train = map(train, as_tibble),
test = map(test, as_tibble)
) %>%
mutate(
model_1 = map(train, ~lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = .x)),
model_2 = map(train, ~lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = .x)),
model_3 = map(train, ~lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = .x))
) %>%
mutate(
rmse_1 = map2_dbl(model_1, test, ~rmse(model = .x, data = .y)),
rmse_2 = map2_dbl(model_2, test, ~rmse(model = .x, data = .y)),
rmse_3 = map2_dbl(model_3, test, ~rmse(model = .x, data = .y))
)
cv_df
## # A tibble: 100 × 9
## train test .id model_1 model_2 model_3 rmse_1 rmse_2 rmse_3
## <list> <list> <chr> <list> <list> <list> <dbl> <dbl> <dbl>
## 1 <tibble [351 × 16]> <tibb… 001 <lm> <lm> <lm> 15.4 15.9 15.5
## 2 <tibble [351 × 16]> <tibb… 002 <lm> <lm> <lm> 17.9 18.4 18.3
## 3 <tibble [351 × 16]> <tibb… 003 <lm> <lm> <lm> 19.3 23.8 21.0
## 4 <tibble [351 × 16]> <tibb… 004 <lm> <lm> <lm> 20.7 20.4 20.0
## 5 <tibble [351 × 16]> <tibb… 005 <lm> <lm> <lm> 20.4 24.0 22.8
## 6 <tibble [351 × 16]> <tibb… 006 <lm> <lm> <lm> 18.4 17.9 17.5
## 7 <tibble [351 × 16]> <tibb… 007 <lm> <lm> <lm> 21.8 23.4 20.6
## 8 <tibble [351 × 16]> <tibb… 008 <lm> <lm> <lm> 18.3 18.5 17.1
## 9 <tibble [351 × 16]> <tibb… 009 <lm> <lm> <lm> 16.4 16.5 15.3
## 10 <tibble [351 × 16]> <tibb… 010 <lm> <lm> <lm> 20.4 25.7 22.9
## # … with 90 more rows
cv_df %>%
dplyr::select(rmse_1:rmse_3) %>%
pivot_longer(
rmse_1:rmse_3,
names_to = "model",
values_to = "rmse",
names_prefix = "rmse_"
) %>%
ggplot(aes(x = model, y = rmse)) +
geom_boxplot() +
labs(
x = "Model",
y = "RMSE"
)
set.seed(2021)
# Use 5-fold validation and create the training sets
train = trainControl(method = "repeatedcv", number = 5, repeats = 100)
# Fit the 4-variables model that we discussed in previous lectures
model_caret_1 =
train(
crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density,
data = crime_df,
trControl = train,
method = 'lm',
na.action = na.pass
)
model_caret_2 =
train(
crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000,
data = crime_df,
trControl = train,
method = 'lm',
na.action = na.pass
)
model_caret_3 =
train(
crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density,
data = crime_df,
trControl = train,
method = 'lm',
na.action = na.pass
)
bind_rows(
as_tibble(model_caret_1$results),
as_tibble(model_caret_2$results),
as_tibble(model_caret_3$results),
) %>%
knitr::kable(digit = 3)
| intercept | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| TRUE | 18.922 | 0.518 | 13.999 | 2.415 | 0.099 | 1.137 |
| TRUE | 19.747 | 0.489 | 14.155 | 3.382 | 0.070 | 1.176 |
| TRUE | 18.520 | 0.545 | 13.290 | 2.925 | 0.073 | 1.119 |